Load packages

library(tidyverse) ## Always load tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) ## Needed for text analysis
library(rstudioapi) ## set working directory
library(knitr) ## print nice tables
library(plotly) ## Interactive plots
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer) ## Color palettes

Environment prep

## Suppress messages
## (chunk defaults for the whole document: hide warnings and messages)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

## Set directory to wherever this script lives
## NOTE(review): this only works when run interactively inside RStudio
## (rstudioapi); setwd() in scripts is fragile — consider here::here()
## or project-relative paths instead.
setwd(dirname(getActiveDocumentContext()$path))

Load overall product info

## List csv files
## (inspect the archive without extracting; list = TRUE returns a data frame
## of metadata and $Name pulls out just the file names)
utils::unzip("sephora_data.zip", list = TRUE)$Name
## [1] "product_info.csv"     "reviews_0-250.csv"    "reviews_1250-end.csv"
## [4] "reviews_250-500.csv"  "reviews_500-750.csv"  "reviews_750-1250.csv"
## Load product info straight from the zip, keeping only the ID, review
## count, and category columns. `category` is the tertiary category when
## present, otherwise the secondary one — coalesce() expresses this fallback
## directly and replaces the redundant `ifelse(is.na(x) == TRUE, ...)`.
product_info <- read_csv(unz("sephora_data.zip", "product_info.csv")) %>%
  select(product_id, reviews, primary_category, secondary_category, tertiary_category) %>% 
  mutate(category = coalesce(tertiary_category, secondary_category))

Which categories have the most reviews?

## Total the listed review counts within each category and show the top ten.
product_info %>% 
  group_by(category) %>% 
  summarize(reviews = sum(reviews, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(reviews)) %>% 
  slice_head(n = 10) %>% 
  kable()
category reviews
Foundation 224222
Mascara 214671
Moisturizers 205380
Perfume 188490
Face Serums 173771
Eyeliner 123261
Face Wash & Cleansers 121357
Makeup 114262
Eyebrow 113709
Mini Size 106088

Load individual reviews

## Read every reviews_*.csv inside the zip, keeping only the columns needed
## downstream; derive a numeric year from the submission timestamp,
## de-duplicate rows, and attach the product-level category info.
reviews <- utils::unzip("sephora_data.zip", list = TRUE)$Name %>%
  str_subset("^reviews_.*\\.csv$") %>%
  map(\(f) read_csv(unz("sephora_data.zip", f),
                    show_col_types = FALSE,
                    col_select = c(product_id, product_name, brand_name,
                                   submission_time, rating, is_recommended,
                                   review_title, review_text))) %>%
  list_rbind() %>%
  mutate(year = as.numeric(str_sub(submission_time, 1, 4))) %>%
  distinct() %>%
  inner_join(product_info, by = "product_id")

Recount categories with highest number of reviews

## Re-tally review volume per category, now counting the individual
## review rows we actually loaded.
reviews %>% 
  count(category, name = "reviews", sort = TRUE) %>% 
  slice_head(n = 10) %>% 
  kable()
category reviews
Moisturizers 206032
Face Serums 174482
Face Wash & Cleansers 121672
Mini Size 85465
Eye Creams & Treatments 70418
Face Masks 66818
Lip Balms & Treatments 61602
Face Oils 41844
Face Sunscreen 39079
Toners 35846

Build token data sets for one product subtype

## Pick subtype
subtype_reviews <- reviews %>% filter(category == "Moisturizers")

## Merge the title of the review with the actual review, and unnest tokens.
## na.rm = TRUE keeps a missing title/body from being pasted in as the
## literal string "NA" (which unnest_tokens would otherwise count as the
## token "na"); the column-name typo "_test" is also fixed to "_text".
tokens <- subtype_reviews %>%
  unite("review_title_and_text", review_title, review_text,
        sep = " ", na.rm = TRUE) %>%
  unnest_tokens(word, review_title_and_text)

## Count use of each token, and average the rating/recommendation of the
## reviews it appears in; keep only tokens used more than 200 times.
token_counts <- tokens %>%
  group_by(word) %>% 
  summarise(
    n = n(),
    average_rating = mean(rating, na.rm = TRUE), 
    average_recommendation = mean(is_recommended, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(n > 200) %>%
  arrange(desc(n))

## Group word use by year: per-word-per-year counts and averages,
## keeping only well-used words with a multi-year history.
token_counts_year <- tokens %>%
  group_by(word, year) %>%
  summarise(
    n = n(),
    average_rating = mean(rating, na.rm = TRUE),
    average_recommendation = mean(is_recommended, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(n)) %>%
  filter(n > 20) %>%      ## keep words used more than 20x per year
  group_by(word) %>%
  filter(n() > 10) %>%    ## keep words that had substantial use more than half of the years
  ungroup()

Look at distribution of word counts

## Histogram of token frequencies on a log10 x axis, with vertical lines
## marking the mean (dashed) and median (solid) counts.
ggplot(token_counts, aes(x = n)) +
  geom_histogram(fill = "cornflowerblue") +
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)), lty = 2) +
  geom_vline(aes(xintercept = median(n, na.rm = TRUE))) +
  scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Word Counts",
       subtitle = "solid line = median; dotted line = mean") +
  theme_classic()

Look at how key words are distributed across average rating

Good sanity check to see if desirable words float to the top, etc.

## Plot word frequency (log scale) against average rating, dropping
## stopwords, tokens with no letters, single characters, and rare words.
token_counts %>%
  anti_join(get_stopwords()) %>%
  filter(str_detect(word, "[A-Za-z]"),
         nchar(word) > 1,
         n > 100) %>%
  ggplot(aes(x = n, y = average_rating)) +
  geom_text(aes(label = word), check_overlap = TRUE, show.legend = FALSE,
            vjust = "top", hjust = "left") +
  scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
  theme_classic()

Run linear models and extract results

## Fit a per-word linear model of average rating on year and extract the
## slope, intercept, and p-value of the year term.
## Rewritten from the superseded dplyr::do() idiom: the old
## filter(!is.null(model)) was a no-op (lm() never returns NULL), and the
## significance branch called summary() three times; here the coefficient
## matrix is computed once per word.
linear_slopes <- token_counts_year %>%
  group_by(word) %>%
  group_modify(~ {
    model <- lm(average_rating ~ year, data = .x)
    coefs <- summary(model)$coefficients
    tibble(
      slope = coef(model)[2],
      intercept = coef(model)[1],
      ## NA when the fit is degenerate (e.g. only one year of data,
      ## so the year row is dropped from the coefficient table)
      significance = if (nrow(coefs) >= 2 && ncol(coefs) >= 4)
        coefs[2, 4] else NA_real_
    )
  }) %>%
  ungroup()

Select words with highest and lowest linear coefficients

In other words, which keywords have the largest decline or increase in associated rating?

## Five words whose associated rating declines fastest (most negative slope)
declining_words <- linear_slopes %>% 
  arrange(slope) %>% 
  slice_head(n = 5) %>%
  select(word, slope)

## Five words whose associated rating climbs fastest (most positive slope)
ascending_words <- linear_slopes %>% 
  arrange(slope) %>% 
  slice_tail(n = 5) %>%
  select(word, slope)

## Combine both extremes, ordered from steepest increase to steepest decline
shifting_words <- bind_rows(declining_words, ascending_words) %>% 
  arrange(desc(slope))

Time to plot!

## Set up some nice colors: a 10-step red→green palette, reversed so the
## fastest-rising words get green and the fastest-falling get red.
palette_colours <- colorRampPalette(brewer.pal(10, "RdYlGn"))(10)
colour_map <- set_names(rev(palette_colours), shifting_words$word)

## Make sure plot legend stays in order of coefs (not alphabetical)
word_order <- shifting_words$word

## Set up plot: one fitted rating-vs-year line per shifting keyword.
token_trajectories <- shifting_words %>%
  ## explicit key: "word" is the only shared column, so results are
  ## unchanged, but the join no longer emits a "Joining with..." message
  inner_join(token_counts_year, by = "word") %>% 
  ## factor levels follow the slope ordering so the legend isn't alphabetical
  mutate(word = factor(word, levels = word_order)) %>% 
  ggplot(aes(x = year, y = average_rating, group = word, color = word)) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = colour_map) +
  scale_x_continuous(breaks = seq(2008, 2022, by = 2)) +
  scale_y_continuous(limits = c(1, 5), breaks = 1:5) +
  labs(x = "Year",
       y = "Average Rating",
       color = "Keyword",
       title = "Trends in Ratings of Moisturizers") +
  theme_bw()

## Make interactive
ggplotly(token_trajectories)

Let’s take a closer look at one of our keywords

## Show three random 5-star reviews from 2022 onward that mention
## "packaging" (slice_sample() replaces the superseded sample_n()).
subtype_reviews %>% 
  filter(year >= 2022,
         rating == 5,
         str_detect(review_text, "packaging")) %>% 
  slice_sample(n = 3) %>% 
  select(review_text) %>% 
  kable()
review_text
This moisturizer is incredible for dry skin! I’ve always had super dry skin and live in a very cold dry place which doesn’t help and this product has been incredible! I use this at night after cleansing and using the hyaluronic acid serum from the same brand/line and they work incredibly together to hydrate my skin! I absolutely love these products and as a bonus I love the green packaging!
Very luxe, treat yourself! This crème fondante is as luxurious as the name sounds. It’s thick, hydrating, soft, buttery and absorbs into your skin with no greasy after effect. It really feels amazing on your skin and super hydrating. It has a soft scent that seems to dissipate quickly. While this may seem pricey, if you are looking for something to hydrate, firm, give a glow, and plump your skin this is it! The packaging is very luxe too. I am completely pleased with this crème. I received this product from Bzzagent for an honest review and I feel this is honest. If you want a luxurious crème to treat yourself and splurge, a little goes a long way, get this!
First of all, beautiful packaging! The moisturizer leaves my skin feeling moisturized after application while also leaving a slight glow. The texture of the product is a bit thick hence why I use it for my night time skin care routine. Overall, I love this product.